This report includes data validation, quality assessment, and preliminary analysis for a Water Quality Program’s Black Lake Pollution Identification and Correction Project. Code is included for transparency and reproducibility.

If the decision to accept or reject a value is unclear, the option most protective of public health will be chosen.

library(ggplot2)
library(dplyr)
library(tidyverse)
library(lubridate)

#Plotly
devtools::install_github("ropensci/plotly")
library(plotly)

Load Dataframe

library(RCurl)
df <- read.csv("https://raw.githubusercontent.com/AlyssaPeter/R-Code-Sample/main/EIM_Black_Lake_PIC_Project_Data.csv", stringsAsFactors = TRUE, fileEncoding = "ISO-8859-1")
df

Data Validation and QA Assessment

Check the dataframe for null values.

null_values <- sapply(df, function(x) sum(is.na(x)))
null_values
##                    Study_ID                 Location_ID 
##                           0                           0 
##  Study_Specific_Location_ID       Field_Collection_Type 
##                           0                           0 
##             Field_Collector Field_Collection_Start_Date 
##                           0                           0 
##    Field_Collection_Comment    Latitude_Decimal_Degrees 
##                           0                           0 
##   Longitude_Decimal_Degrees                   Sample_ID 
##                           0                           0 
##   Sample_Field_Replicate_ID       Sample_Replicate_Flag 
##                           0                           0 
##       Sample_Composite_Flag               Sample_Matrix 
##                           0                           0 
##               Sample_Source    Sample_Collection_Method 
##                           0                           0 
##   Sample_Preparation_Method       Result_Parameter_Name 
##                         312                           0 
## Result_Parameter_CAS_Number           Lab_Analysis_Date 
##                           0                           0 
##                Result_Value          Result_Value_Units 
##                           0                           0 
##      Result_Reporting_Limit Result_Reporting_Limit_Type 
##                           0                         312 
##      Result_Detection_Limit Result_Detection_Limit_Type 
##                           0                           0 
##       Result_Data_Qualifier           Fraction_Analyzed 
##                           0                           0 
##               Result_Method             Result_Lab_Name 
##                           0                           0

Check if Latitude and Longitude are the same per each Study_Specific_Location_ID. If the return is “TRUE”, then results are okay.

location_coords_consistency <- df %>%
  group_by(Study_Specific_Location_ID) %>%
  summarise(
    consistent_latitude = n_distinct(Latitude_Decimal_Degrees) == 1,
    consistent_longitude = n_distinct(Longitude_Decimal_Degrees) == 1
  )
location_coords_consistency
## # A tibble: 26 × 3
##    Study_Specific_Location_ID consistent_latitude consistent_longitude
##    <fct>                      <lgl>               <lgl>               
##  1 BLA001                     TRUE                TRUE                
##  2 BLA002                     TRUE                TRUE                
##  3 BLA00201                   TRUE                TRUE                
##  4 BLA00202                   TRUE                TRUE                
##  5 BLA00203                   TRUE                TRUE                
##  6 BLA00204                   TRUE                TRUE                
##  7 BLA003                     TRUE                TRUE                
##  8 BLA00301                   TRUE                TRUE                
##  9 BLA004                     TRUE                TRUE                
## 10 BLA005                     TRUE                TRUE                
## # ℹ 16 more rows

Calculate the number of QA samples taken. The result should be 0.10 or greater (at least 10% of the total).

replicate_flag_counts <- df %>%
  summarise(
    total_count = n(),
    replicate_count = sum(Sample_Replicate_Flag == "Y"),
    replicate_proportion = replicate_count / total_count
  )
replicate_flag_counts
##   total_count replicate_count replicate_proportion
## 1         312              31           0.09935897

Provide recommendation to the team to increase QA sample frequency.

Check that sample values are within expected range. If no records are returned, no values are out of expected range. Any values that are returned, please review and accept/reject accordingly.

##Check expected values
#E.coli is expected to be between 1 and 2419.6
#TP is expected to be between 0.005 and 1000
# Filter records for Total Phosphorus and E. coli that don't meet the criteria
invalid_records <- df %>%
  filter(
    (Result_Parameter_Name == "Total Phosphorus" & (Result_Value < 0.005 | Result_Value > 1000)) |
      (Result_Parameter_Name == "E. coli" & (Result_Value < 1 | Result_Value > 2420))
  )

# Display the records that fall out of bounds
invalid_records
##  [1] Study_ID                    Location_ID                
##  [3] Study_Specific_Location_ID  Field_Collection_Type      
##  [5] Field_Collector             Field_Collection_Start_Date
##  [7] Field_Collection_Comment    Latitude_Decimal_Degrees   
##  [9] Longitude_Decimal_Degrees   Sample_ID                  
## [11] Sample_Field_Replicate_ID   Sample_Replicate_Flag      
## [13] Sample_Composite_Flag       Sample_Matrix              
## [15] Sample_Source               Sample_Collection_Method   
## [17] Sample_Preparation_Method   Result_Parameter_Name      
## [19] Result_Parameter_CAS_Number Lab_Analysis_Date          
## [21] Result_Value                Result_Value_Units         
## [23] Result_Reporting_Limit      Result_Reporting_Limit_Type
## [25] Result_Detection_Limit      Result_Detection_Limit_Type
## [27] Result_Data_Qualifier       Fraction_Analyzed          
## [29] Result_Method               Result_Lab_Name            
## <0 rows> (or 0-length row.names)

Relative Percent Difference

Check the Relative Percent Difference (RPD) for E. coli and phosphorus samples.Samples outside defined thresholds may be rejected.

\[ RPD = \frac{\lvert{R1-R2}\rvert}{\frac{R1+R2}{2}}\times100 \] Total Phosphorus: TP QA sample values must be within 20% of the sample. OR if TP<0.025, sample is accepted even if they fall outside the 20% range.

# Filter dataframe by result_parameter_name = 'Total Phosphorus'
total_phosphorus_data <- df %>%
  filter(Result_Parameter_Name == "Total Phosphorus")

# Filter for Sample_Replicate_Flag == "Y"
replicate_Y_data <- total_phosphorus_data %>%
  filter(Sample_Replicate_Flag == "Y")

# Pair rows for Sample_Replicate_Flag == "Y" with Sample_Replicate_Flag == "N"
paired_data <- replicate_Y_data %>%
  left_join(total_phosphorus_data %>%
              filter(Sample_Replicate_Flag == "N"),
            by = c("Study_Specific_Location_ID", "Field_Collection_Start_Date"),
            suffix = c("_Y", "_N"))

# Calculate the relative percent difference between the two paired values
paired_data <- paired_data %>%
  mutate(relative_percent_difference = abs(Result_Value_Y - Result_Value_N) / ((Result_Value_Y + Result_Value_N) / 2) * 100)

# Return a table showing both paired values, Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value, and calculated relative percent difference values
TP_result_table <- paired_data %>%
  select(Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value_Y, Result_Value_N, relative_percent_difference)

# Print the table
print(TP_result_table)
##   Study_Specific_Location_ID Field_Collection_Start_Date Result_Value_Y
## 1                     BLA001                  12/14/2023          0.028
## 2                     BLA003                   7/12/2023          0.041
## 3                     BLA004                   7/20/2023          0.041
## 4                     BLA005                   7/25/2023          0.039
## 5                     BLA007                   9/14/2023          0.039
## 6                     BLA008                  10/12/2023          0.015
## 7                     BLA009                  10/12/2023          0.024
## 8                     BLA009                  12/15/2023          0.015
##   Result_Value_N relative_percent_difference
## 1          0.030                    6.896552
## 2          0.041                    0.000000
## 3          0.041                    0.000000
## 4          0.050                   24.719101
## 5          0.041                    5.000000
## 6          0.015                    0.000000
## 7          0.023                    4.255319
## 8             NA                          NA
# Assign a group to each row based on the average of the paired Result_Value
TP_result_table <- TP_result_table %>%
  mutate(
    group = ifelse((Result_Value_Y + Result_Value_N) / 2 < 0.025, "accepted", "evaluate"),
    highlight = ifelse(relative_percent_difference > 20, "highlight", "")
  )

# Print the table with group and highlight columns
print(TP_result_table)
##   Study_Specific_Location_ID Field_Collection_Start_Date Result_Value_Y
## 1                     BLA001                  12/14/2023          0.028
## 2                     BLA003                   7/12/2023          0.041
## 3                     BLA004                   7/20/2023          0.041
## 4                     BLA005                   7/25/2023          0.039
## 5                     BLA007                   9/14/2023          0.039
## 6                     BLA008                  10/12/2023          0.015
## 7                     BLA009                  10/12/2023          0.024
## 8                     BLA009                  12/15/2023          0.015
##   Result_Value_N relative_percent_difference    group highlight
## 1          0.030                    6.896552 evaluate          
## 2          0.041                    0.000000 evaluate          
## 3          0.041                    0.000000 evaluate          
## 4          0.050                   24.719101 evaluate highlight
## 5          0.041                    5.000000 evaluate          
## 6          0.015                    0.000000 accepted          
## 7          0.023                    4.255319 accepted          
## 8             NA                          NA     <NA>      <NA>
#Values highlighted in table are REJ

RPD for samples needs to be evaluated in context. Results returned in the table as “accepted” will be accepted regardless of RPD because the sample concentrations are close to the detection limit.

Those labeled “evaluate” should be assessed based on both the RPD value and what is known about the site heterogeneity.

BLA005, collected on 7/25/2023, does not meet quality standards and will be removed from the analysis.

E. coli: Bacteria samples with low counts tend to have higher variability. Therefore, EC sample pairs (sample and field duplicate) will be separated into two groups:

• “low count samples” where the pair mean ≤ 20 MPN/100 mL and • “higher count samples” where the pair mean > 20 MPN/100 mL.

For precision of bacteria field replicates: • 50% of the replicate pairs must be at or below 20% RPD • 90% of the pairs must be at or below 50% RPD

# Filter for E. coli
ecoli_data <- subset(df, Result_Parameter_Name == 'E.coli')

# Filter for Sample_Replicate_Flag == "Y"
replicate_Y_data <- ecoli_data %>%
  filter(Sample_Replicate_Flag == "Y")

# Pair rows for Sample_Replicate_Flag == "Y" with Sample_Replicate_Flag == "N"
paired_df <- replicate_Y_data %>%
  left_join(ecoli_data %>%
              filter(Sample_Replicate_Flag == "N"),
            by = c("Study_Specific_Location_ID", "Field_Collection_Start_Date"),
            suffix = c("_Y", "_N"))

# Calculate the relative percent difference between the two paired values
paired_df <- paired_df %>%
  mutate(relative_percent_difference = abs(Result_Value_Y - Result_Value_N) / ((Result_Value_Y + Result_Value_N) / 2) * 100)

EC_result_table <- paired_df %>%
  select(Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value_Y, Result_Value_N, relative_percent_difference)

# Return the table
EC_result_table
##    Study_Specific_Location_ID Field_Collection_Start_Date Result_Value_Y
## 1                      BLA001                  12/14/2023             15
## 2                      BLA002                   1/11/2024             49
## 3                      BLA002                   1/22/2024             51
## 4                      BLA002                   1/29/2024              3
## 5                    BLA00201                    2/5/2024             20
## 6                    BLA00203                   9/14/2023            186
## 7                    BLA00204                    1/9/2024             29
## 8                      BLA003                   7/12/2023            410
## 9                    BLA00301                   1/16/2024             93
## 10                     BLA004                   7/20/2023            365
## 11                     BLA004                   1/11/2024              8
## 12                     BLA005                   7/25/2023             63
## 13                   BLA00502                  11/27/2023              3
## 14                     BLA007                   9/14/2023            816
## 15                     BLA007                    2/5/2024            285
## 16                   BLA00702                   1/29/2024              2
## 17                   BLA00705                   1/22/2024             44
## 18                     BLA008                  10/12/2023              4
## 19                     BLA009                  10/12/2023            127
## 20                     BLA009                  12/14/2023              9
##    Result_Value_N relative_percent_difference
## 1              10                   40.000000
## 2              45                    8.510638
## 3              47                    8.163265
## 4               6                   66.666667
## 5              26                   26.086957
## 6             214                   14.000000
## 7              52                   56.790123
## 8             652                   45.574388
## 9             111                   17.647059
## 10            236                   42.928453
## 11             10                   22.222222
## 12             81                   25.000000
## 13              3                    0.000000
## 14            980                   18.262806
## 15            411                   36.206897
## 16              4                   66.666667
## 17             56                   24.000000
## 18              6                   40.000000
## 19             27                  129.870130
## 20              9                    0.000000
# Assign a group based on the average of the Result_Value_Replicate_Y and Result_Value_Replicate_N
EC_result_table$group <- ifelse((EC_result_table$Result_Value_Y + EC_result_table$Result_Value_N) / 2 <= 20,
                             "low count samples", "higher count samples")

# Calculate the percent of values with Relative_Percent_Difference less than or equal to 20 by group
percent_diff_20 <- with(EC_result_table, tapply(relative_percent_difference <= 20, group, mean) * 100)

# Calculate the percent of values with Relative_Percent_Difference less than or equal to 50 by group
percent_diff_50 <- with(EC_result_table, tapply(relative_percent_difference <= 50, group, mean) * 100)

# Combine the results into a data frame
percent_diff_results <- data.frame(
  Group = c("low count samples", "higher count samples"),
  Percent_Diff_LE_20 = c(percent_diff_20["low count samples"], percent_diff_20["higher count samples"]),
  Percent_Diff_LE_50 = c(percent_diff_50["low count samples"], percent_diff_50["higher count samples"])
)

# Return the results
percent_diff_results
##                                     Group Percent_Diff_LE_20 Percent_Diff_LE_50
## low count samples       low count samples           28.57143           71.42857
## higher count samples higher count samples           38.46154           84.61538
# • 50% of the replicate pairs must be at or below 20% RPD 
# • 90% of the pairs must be at or below 50% RPD

The replicate sample for BLA009 on 10/12/2023 stands out as an anomaly with an RPD of 129%. This sample will be removed from the analysis as the sample is very likely not representative of the system.

50% of replicate pairs must be at or below 20% RPD. Currently, 40% of low count samples area and 37.5% of high count samples are.

90% of replicate pairs must be at or below 50% RPD. Currently, 100% of low count samples are and 75% of high count samples are.

Low count samples with non-anomalous larger RPD values will be included in analyses for the time being because the data set is currently incomplete and when more replicate samples have been collected, the percent of samples meeting the thresholds may meet quality standards. A final determination will be made when all data have been collected and the project concluded in 2025.

Analysis should be re-run with the corrected dataframe to see how quality assessment improves.

Data Preparation

Now that the QA Assessment has run, data is prepped for analysis. Remove all QA records and any other rejected records:

# Create a new dataframe with the specified conditions
new_df <- df %>%
  filter(Sample_Replicate_Flag != "Y") %>%
  filter(!(Study_Specific_Location_ID == "BLA005" & Field_Collection_Start_Date == "7/25/2023")) %>%
  filter(!(Study_Specific_Location_ID == "BLA009" & Field_Collection_Start_Date == "10/12/2023"))

Define the date field so it may be recognized as a linear measure of time:

##Convert date class
new_df$Field_Collection_Start_Date
new_df <- new_df %>% 
  mutate(Field_Collection_Start_Date = mdy(Field_Collection_Start_Date))
head(new_df)
new_df$Field_Collection_Start_Date

Add a new field, “Season”, defined by the Field_Collection_Start_Date:

new_df$Season <- ifelse(format(as.Date(new_df$Field_Collection_Start_Date), "%m") %in% c("05", "06", "07", "08", "09"), "Dry", "Wet")

new_df

Save the prepped dataset as an object

save(new_df, file="J:/Git_WS/R-Code-Sample/Inputs/new_df.rda")
write.csv(new_df, file="O:/EH_Health/Surface Water/+ PIC/Projects/Black Lake Grant 2022-25/Data/Black_Lake_Data_Prepped.csv", row.names=FALSE)

Data Analysis

Look at numeric distributions:

# Plot numeric distributions of Result_Value by Result_Parameter_Name
ggplot(new_df, aes(x = Result_Value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~Result_Parameter_Name, scales = "free_x") +
  theme_minimal() +
  xlab("Result Value") +
  ylab("Frequency") +
  ggtitle("Distributions of Result Value by Parameter Name")

##check normality
# Check the normality of Result_Value by Result_Parameter_Name using QQ plots
ggplot(new_df, aes(sample = Result_Value)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~Result_Parameter_Name) +
  theme_minimal() +
  xlab("Theoretical Quantiles") +
  ylab("Sample Quantiles") +
  ggtitle("QQ Plots of Result Value by Parameter Name")

Create a boxplot using Plotly for E. coli. The plot is interactive.

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

Create a boxplot using Plotly for Total Phosphorus

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

Create plots for each E.coli and Total Phosphorus results over time. The plots are interactive and you can click on the series to the right to hide or show them. Static pngs are also available for download.

Calculate some preliminary statistics.

Return the number of samples taken by site:

#how many samples were taken?
nrow(new_df)
## [1] 277
#250

# Make table summary
summary <- df %>%
  summarise(
    `Total number E. coli Samples Taken` = sum(Result_Parameter_Name == "E.coli"),
    `Total number of Total Phosphorus Samples Taken` = sum(Result_Parameter_Name == "Total Phosphorus"),
    `Number E. coli QA Samples` = sum(Result_Parameter_Name == "E.coli" & Sample_Replicate_Flag == "Y"),
    `Number TP QA Samples` = sum(Result_Parameter_Name == "Total Phosphorus" & Sample_Replicate_Flag == "Y")
  )

summary
##   Total number E. coli Samples Taken
## 1                                204
##   Total number of Total Phosphorus Samples Taken Number E. coli QA Samples
## 1                                             88                        20
##   Number TP QA Samples
## 1                    8

Calculate the geomean by routine site and season for E. coli:

#Remove all Segmentation samples and filter for E. coli only
df_filtered <- new_df %>%
  filter(Field_Collection_Comment != "Segmentation", Result_Parameter_Name == "E.coli")

# Filter for "Wet" and "Dry" seasons only
df_season_filtered <- df_filtered %>%
  filter(Season %in% c("Wet", "Dry"))

# Define a function to calculate the geometric mean
geomean <- function(x) exp(mean(log(x), na.rm = TRUE))

# Aggregate by season and study_specific_location_id and calculate the geomean and counts
aggregated_df <- df_season_filtered %>%
  group_by(Study_Specific_Location_ID, Season, Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
  summarise(
    Geomean = geomean(Result_Value),
    `Total Number of Samples` = n(),
    `% of Samples above 100` = sum(Result_Value >= 100) / n() * 100,
    `% of Samples above 320` = sum(Result_Value >= 320) / n() * 100,
    .groups = 'drop'
  )

aggregated_df
## # A tibble: 18 × 8
##    Study_Specific_Locatio…¹ Season Latitude_Decimal_Deg…² Longitude_Decimal_De…³
##    <fct>                    <chr>                   <dbl> <fct>                 
##  1 BLA001                   Dry                      47.0 -122.971982           
##  2 BLA001                   Wet                      47.0 -122.971982           
##  3 BLA002                   Dry                      47.0 -122.964812           
##  4 BLA002                   Wet                      47.0 -122.964812           
##  5 BLA003                   Dry                      47.0 -122.96155            
##  6 BLA003                   Wet                      47.0 -122.96155            
##  7 BLA004                   Dry                      47.0 -122.975222           
##  8 BLA004                   Wet                      47.0 -122.975222           
##  9 BLA005                   Dry                      47.0 -122.9714             
## 10 BLA005                   Wet                      47.0 -122.9714             
## 11 BLA006                   Dry                      47.0 -122.97618            
## 12 BLA006                   Wet                      47.0 -122.97618            
## 13 BLA007                   Dry                      47.0 -122.9882787          
## 14 BLA007                   Wet                      47.0 -122.9882787          
## 15 BLA008                   Dry                      47.0 -122.965046           
## 16 BLA008                   Wet                      47.0 -122.965046           
## 17 BLA009                   Dry                      47.0 -122.984555           
## 18 BLA009                   Wet                      47.0 -122.984555           
## # ℹ abbreviated names: ¹​Study_Specific_Location_ID, ²​Latitude_Decimal_Degrees,
## #   ³​Longitude_Decimal_Degrees
## # ℹ 4 more variables: Geomean <dbl>, `Total Number of Samples` <int>,
## #   `% of Samples above 100` <dbl>, `% of Samples above 320` <dbl>

Calculate the geomean by routine site and season for Total Phosphorus:

df_tp <- new_df %>%
  filter(Field_Collection_Comment != "Segmentation", Result_Parameter_Name == "Total Phosphorus")

# Filter for "Wet" and "Dry" seasons only
df_season_tp <- df_tp %>%
  filter(Season %in% c("Wet", "Dry"))

# Define a function to calculate the geometric mean
geomean <- function(x) exp(mean(log(x), na.rm = TRUE))

# Aggregate by season and study_specific_location_id and calculate the geomean and counts
aggregated_df_tp <- df_season_tp %>%
  group_by(Study_Specific_Location_ID, Season, Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
  summarise(
    Geomean = geomean(Result_Value),
    `Total Number of Samples` = n(),
    `% of Samples above 0.01mg/L` = sum(Result_Value >= 0.01) / n() * 100,
    .groups = 'drop'
  )

aggregated_df_tp
## # A tibble: 18 × 7
##    Study_Specific_Locatio…¹ Season Latitude_Decimal_Deg…² Longitude_Decimal_De…³
##    <fct>                    <chr>                   <dbl> <fct>                 
##  1 BLA001                   Dry                      47.0 -122.971982           
##  2 BLA001                   Wet                      47.0 -122.971982           
##  3 BLA002                   Dry                      47.0 -122.964812           
##  4 BLA002                   Wet                      47.0 -122.964812           
##  5 BLA003                   Dry                      47.0 -122.96155            
##  6 BLA003                   Wet                      47.0 -122.96155            
##  7 BLA004                   Dry                      47.0 -122.975222           
##  8 BLA004                   Wet                      47.0 -122.975222           
##  9 BLA005                   Dry                      47.0 -122.9714             
## 10 BLA005                   Wet                      47.0 -122.9714             
## 11 BLA006                   Dry                      47.0 -122.97618            
## 12 BLA006                   Wet                      47.0 -122.97618            
## 13 BLA007                   Dry                      47.0 -122.9882787          
## 14 BLA007                   Wet                      47.0 -122.9882787          
## 15 BLA008                   Dry                      47.0 -122.965046           
## 16 BLA008                   Wet                      47.0 -122.965046           
## 17 BLA009                   Dry                      47.0 -122.984555           
## 18 BLA009                   Wet                      47.0 -122.984555           
## # ℹ abbreviated names: ¹​Study_Specific_Location_ID, ²​Latitude_Decimal_Degrees,
## #   ³​Longitude_Decimal_Degrees
## # ℹ 3 more variables: Geomean <dbl>, `Total Number of Samples` <int>,
## #   `% of Samples above 0.01mg/L` <dbl>
#add Result_parameter_name column
aggregated_df_tp$Result_Parameter_Name <- "Total Phosphorus"

#add Meets_Standards column
aggregated_df_tp$`Meets_Standards` <- ifelse(aggregated_df_tp$Geomean < 0.01, "Yes", "No")

Calculate E. coli geomean for segmented sites by season:

#Repeat for segmented sites
df_seg <- new_df %>%
  filter(Field_Collection_Comment == "Segmentation", Result_Parameter_Name == "E.coli")

# Filter for "Wet" and "Dry" seasons only
df_seg_filtered <- df_seg %>%
  filter(Season %in% c("Wet", "Dry"))

# Define a function to calculate the geometric mean
geomean <- function(x) exp(mean(log(x), na.rm = TRUE))

# Aggregate by season and study_specific_location_id and calculate the geomean and counts
aggregated_df_seg <- df_seg_filtered %>%
  group_by(Study_Specific_Location_ID, Season, Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
  summarise(
    Geomean = geomean(Result_Value),
    `Total Number of Samples` = n(),
    `% of Samples above 100` = sum(Result_Value >= 100) / n() * 100,
    `% of Samples above 320` = sum(Result_Value >= 320) / n() * 100,
    .groups = 'drop'
  )

aggregated_df_seg
## # A tibble: 28 × 8
##    Study_Specific_Locatio…¹ Season Latitude_Decimal_Deg…² Longitude_Decimal_De…³
##    <fct>                    <chr>                   <dbl> <fct>                 
##  1 BLA002                   Dry                      47.0 -122.964812           
##  2 BLA002                   Wet                      47.0 -122.964812           
##  3 BLA00201                 Dry                      47.0 -122.97224            
##  4 BLA00201                 Wet                      47.0 -122.97224            
##  5 BLA00202                 Dry                      47.0 -122.96935            
##  6 BLA00202                 Wet                      47.0 -122.96935            
##  7 BLA00203                 Dry                      47.0 -122.96175            
##  8 BLA00203                 Wet                      47.0 -122.96175            
##  9 BLA00204                 Dry                      47.0 -122.957002           
## 10 BLA00204                 Wet                      47.0 -122.957002           
## # ℹ 18 more rows
## # ℹ abbreviated names: ¹​Study_Specific_Location_ID, ²​Latitude_Decimal_Degrees,
## #   ³​Longitude_Decimal_Degrees
## # ℹ 4 more variables: Geomean <dbl>, `Total Number of Samples` <int>,
## #   `% of Samples above 100` <dbl>, `% of Samples above 320` <dbl>

Now merge geomean tables into a dataframe to be used in the dashboard and save it as a csv

#Merge E.coli tables
Black_Lake_Geomean_Table <- rbind(aggregated_df, aggregated_df_seg)

#Add columns "Result_Parameter_Name" and "Meets standards"
Black_Lake_Geomean_Table$Result_Parameter_Name <- "E.coli"
Black_Lake_Geomean_Table$`Meets_Standards` <- ifelse(Black_Lake_Geomean_Table$Geomean < 100 | Black_Lake_Geomean_Table$`% of Samples above 320` < 10, "Yes", "No")

#Merge TP table
Black_Lake_Geomean_Table <-merge(Black_Lake_Geomean_Table, aggregated_df_tp, all.x = TRUE)
Black_Lake_Geomean_Table <- bind_rows(Black_Lake_Geomean_Table, aggregated_df_tp)

#Export to csv
write.csv(Black_Lake_Geomean_Table, file="O:/EH_Health/Surface Water/+ PIC/Projects/Black Lake Grant 2022-25/Data/Support Data/Black_Lake_Geomean_Table.csv", row.names=FALSE)